# Replication File for
# "Coalition Inclusion Probabilities: A Party-Strategic
# Measure for Predicting Policy and Politics
# Authors: Mark A. Kayser, Matthias Orlowski
# and Jochen Rehmert
# Code to replicate Tables 2, 3, 4, B1 (Appendix), B2 (Appendix), 
# D1 (Appendix) and Table E1 (Appendix), Table F2 (Appendix)
# as well as Figures 2, E1 (Appendix) and E2 (Appendix)

###############################################################################
#                          read and prepare data                              #
###############################################################################

# set your directory to the folder with
# "fun_predCL.R" 
# "fun_getPrIngov.R"
# "getTopThree.R"
# "data_Table2.RDS"

setwd('...')


# load necessary packages 
# some of these packages may need to be installed
library(survival);library(foreign);library(stargazer);require(stringr)
library(splitstackshape);library(flux);library(PRROC);library(ROCR)
library(ggplot2);library(xtable)

# load necessary self-written functions
source("fun_predCL.R")
source("fun_getPrIngov.R")
source("getTopThree.R")


# read data
DT <- readRDS('data_Table2.RDS')

# ---------- data partition ----------#
# Test and training set
set.seed(1104)
# partition dataset while being oblivious to majority situations
trs <- sample(unique(DT$cabinet_id), floor(.8*length(unique(DT$cabinet_id))))
DT$set <- "test"
DT$set[which(DT$cabinet_id %in% trs)] <- "training"

# Generate Interactions
crInteractions <- function(dat){
  
  # single party majority predictor
  dat[["adomp1"]] = (dat[["leg_class"]] == "A")*(dat[["numpar"]] == 1)*(dat[["dompar"]])
  dat[["domp1"]] = (dat[["leg_class"]] != "A")*(dat[["numpar"]] == 1)*(dat[["dompar"]])
  dat[["np1"]] <- as.numeric(dat[["numpar"]] == 1)
  
  dat[["nanumpar"]] = (dat[["leg_class"]] != "A")*(dat[["numpar"]])
  
  dat[["naminwin"]] = (dat[["leg_class"]] != "A")*(dat[["minwin"]])
  dat[["naminor"]] = (dat[["leg_class"]] != "A")*(dat[["minor"]])
  dat[["namginv"]] = (dat[["leg_class"]] != "A")*(dat[["minor"]])*(dat[["inv_rule"]])
  dat[["namedian"]] = (dat[["leg_class"]] != "A")*(dat[["median"]])
  dat[["nasecond"]] = (dat[["leg_class"]] != "A")*(dat[["second"]])
  dat[["nathird"]] = (dat[["leg_class"]] != "A")*(dat[["third"]])
  
  
  dat[["mgodiv1"]] = dat[["odiv1"]]*dat[["minor"]]
  dat[["mgodiff"]] = dat[["odiff"]]*dat[["minor"]]
  dat[["mginvest"]] = dat[["inv_rule"]]*dat[["minor"]]
  
  dat[["confl_sq"]] = dat[["conflict"]]*dat[["sq"]]
  dat[["confl_pm"]] = dat[["conflict"]]*dat[["prevpm"]]	
  dat[["cont_sq"]] = dat[["cont_rule"]]*dat[["sq"]]
  dat[["pe_sq"]] = dat[["post_elc"]]*dat[["sq"]]	
  dat[["chng_pe"]] = dat[["avg_chng"]]*dat[["post_elc"]]	
  dat[["chng_sq"]] = dat[["avg_chng"]]*dat[["sq"]]	
  dat[["chng_pe_sq"]] = dat[["avg_chng"]]*dat[["post_elc"]]*dat[["sq"]]
  
  
  return(dat)	
  
}

DT <- crInteractions(DT)

# ---------- declare test and training data ---------#

# TD is test data without majority situations (i.e., leg_class A)
TD <- DT[which(DT$set == "test" & DT$leg_class != "A"), ]
TD <- na.omit(TD)

# test data containing "majority situations"
TD.maj <- DT[which(DT$set == "test"),]
TD.maj <- na.omit(TD.maj)

# training data set pruned to constant sample for all models
DT2 <- na.omit(DT[which(DT$set == "training"),])


###############################################################################
#                 replication Martin & Stevension 2001, Model 7               #
###############################################################################

#--- formula ---#
f_ms7 <- as.formula(realg ~ minor + minwin + numpar + dompar + median + gdiv1 
                    + mgodiv1 + prevpm + sq + mginvest + anmax + anti_pact + pro_pact 
                    + strata(cabinet_id))

#--- excl. majority situations ---#
m_ms7 <- clogit(f_ms7
                , data = DT2[which(DT2$set == "training" & DT2$leg_class != "A"), ])

#--- incl. majority situations ---#
m_ms7.maj <- clogit(f_ms7
                    , data = DT2[which(DT2$set == "training"),])
summary(m_ms7.maj)

###############################################################################
#                                 Naive Model                                 #
###############################################################################
#--- formula ---#
f_naive <- as.formula(realg ~ dompar + gdiv1 + sq
                      + strata(cabinet_id))

#--- excl. majority situations ---#
m_naive <- clogit(f_naive, data = DT2[which(DT2$set == "training" & DT2$leg_class != "A"), ])

#--- incl. majority situations ---#
m_naive.maj <- clogit(f_naive, data = DT2[which(DT2$set == "training"),])
summary(m_naive.maj)


###############################################################################
#                 replication Martin & Stevension 2010, Model 3               #
###############################################################################

#--- excl. majority situations ---#
m_ms10 <- update(m_ms7, . ~ . + cab_fam + similar + confl_sq + confl_pm # + cont_sq 
                 + pe_sq + avg_chng + chng_pe + chng_sq + chng_pe_sq)

#--- incl. majority situations ---#
m_ms10.maj <- update(m_ms7.maj, . ~ . + cab_fam + similar + confl_sq + confl_pm # + cont_sq 
                     + pe_sq + avg_chng + chng_pe + chng_sq + chng_pe_sq)

###############################################################################
#                        Kayser-Orlowski-Rehmert Model                        #
###############################################################################

#--- excl. majority situations ---#
m_max <- update(m_ms10, . ~ . - cab_fam - anti_pact - pro_pact - gdiv1 - mgodiv1  +  anti + second + third  + cab_hist + diff +mgodiff)

#--- incl. majority situations ---#
m_max.maj <- update(m_ms10.maj, . ~ . - cab_fam  - anti_pact - pro_pact - gdiv1 - mgodiv1 +  anti + second + third  + cab_hist + diff + mgodiff)


###############################################################################
#                                  Table 2                                    #
###############################################################################

stargazer(m_max,
          covariate.labels = c(
            "Minority Government",
            "Minimal Winning Coalition",
            "Number of Parties in Coalition",
            "Largest Party in Coalition (LP)",
            "Median Party in Coalition",
            "Previous PM Party in Coalition (PM)",
            "Status Quo (SQ)",
            "Minority Government with Investiture Requirement",
            "Anti-Establishment Preference in Coalition",
            "Similarity",
            "Intracabinet Conflict $\\times$ SQ",
            "Intracabinet Conflict $\\times$ PM",
            "Postelection Bargaining (PE) $\\times$ SQ",
            "Average Seat Change (SC)",
            "SC $\\times$ PE",
            "SC $\\times$ SQ",
            "SC $\\times$ PE $\\times$ SQ",
            "Anti-System Party in Coalition",
            "Second Largest Party",
            "Third Largest Party",
            "Cabinet History",
            "Ideological Range in Coalition (Party Families)",
            "Ideological Divisions in Majority Opposition (Party Families)"
          ), out = "table2.tex")
# ------------- #


###############################################################################
#                                  Table 3                                    #
###############################################################################


# get predictions for Test Data (TD) with coefficient from m_max
TD$pr_kor <- predCL(m_max, new.dat = TD, strat.var = "cabinet_id")

var <- "pr_kor"

getPr <- function(dat, pred.var = var){
  
  mps <- unlist(strsplit(pred.var, "_"))[2]
  pr_vals <- dat[[pred.var]]
  
  # maximal predicted probability
  dat[[paste("maxpr", mps, sep = "_")]] <- max(pr_vals)
  
  # Difference between highest and second highest predicted probability
  dat[[paste("pr2d", mps, sep = "_")]] <- -diff(sort(pr_vals, decreasing = TRUE))[1]
  
  return(dat)
  
}

TD <- ddply(TD, .(cabinet_id), getPr, pred.var =  var) 

TD[[paste0("pp_",var)]] <- as.numeric(TD[[var]] == TD[[paste0("max",var)]])

# ---------------------------------  Table 3 --------------------------------- #

t3 <- table(TD[!is.na(TD[[var]]), paste0("pp_",var)], TD[!is.na(TD[[var]]), "realg"])
t3.prec <- round(t3[2,2]/(t3[1,2] + t3[2,2]),2)
t3.rec <- round(t3[2,2]/(t3[2,1] + t3[2,2]),2)

fconf_mat <- matrix(ncol = 2, nrow = 6)
fconf_mat[c(1,3), c(1, 2)] <- t3
ctab <- prop.table(t3, margin = 2)
fconf_mat[c(2,4), c(1, 2)] <- round(ctab,3)
fconf_mat[5, 1] <- t3.prec
fconf_mat[6, 1] <- t3.rec

row.names(fconf_mat) <- c(expression(hat("not realized")),"", expression(hat("realized")),"",
                          "Precision","Recall"  )
colnames(fconf_mat) <- c("not realized","realized")

 stargazer(fconf_mat  , digits = 3    
           , out = "table3.tex"
            , out.header = FALSE)
# ---------------------------------------------------------------------------- #



###############################################################################
#                                  Table B1                                    #
###############################################################################


# ------ Table B1 in the Appendix ------ #
stargazer(m_naive, m_ms7, m_ms10, m_max,
          out = "tableB1.tex")          #
# --------------------------------------#

###############################################################################
#                                  Table 4                                    #
###############################################################################

#--- formula ---#
f_nmod <- as.formula(realg ~ dompar + np1 + adomp1 + domp1
                     + naminor + namginv 
                     + naminwin + nanumpar + namedian 
                     + diff 
                     + sq 
                     + anti
                     + nasecond + nathird 
                     + cab_hist 
                     + strata(cabinet_id))


#--- incl. majority situations ---#
m_nmod <- clogit(f_nmod, data = DT2[which(DT2$set == "training" ), ]) 


# --- Table 4 --- #
stargazer(m_nmod,
          covariate.labels = c(
            "Largest Party in Coalition",
            "Single Party Government",
            "Largest Party $\\times$ Single Party Gov't",
            "No-Majority Situation $\\times$ Largest Party $\\times$ Single Party Gov't",
            "No-Majority $\\times$ Minority Government",
            "No-Majority $\\times$ Minority Gov't $\\times$ Investiture Vote",
            "No-Majority $\\times$ Minimal Winning Coalition",
            "No-Majority $\\times$ Number of Parties in Coalition",
            "No-Majority $\\times$ Median Party in Coalition",
            "Ideological Range in Coalition",
            "Status Quo",
            "Anti-Establishment Party in Coalition",
            "No-Majority $\\times$ Second Largest Party",
            "No-Majority $\\times$ Third Largest Party",
            "Cabinet History"
          ), out = "table4.tex") 
# --------------- #



###############################################################################
#                        Predictions for Model Comparison                     #
###############################################################################

# out-sample predictions 
TD$pr_naive <- predCL(m_naive, new.dat = TD, strat.var = "cabinet_id")
TD$pr_ms01 <- predCL(m_ms7, new.dat = TD, strat.var = "cabinet_id")
TD$pr_ms10 <- predCL(m_ms10, new.dat = TD, strat.var = "cabinet_id")
TD$pr_kor <- predCL(m_max, new.dat = TD, strat.var = "cabinet_id")
TD$pr_korp <- predCL(m_nmod, new.dat = TD, strat.var = "cabinet_id")

TD$pr_maj_naive <- predCL(m_naive.maj, new.dat = TD, strat.var = "cabinet_id")
TD$pr_maj_ms01 <- predCL(m_ms7.maj, new.dat = TD, strat.var = "cabinet_id")
TD$pr_maj_ms10 <- predCL(m_ms10.maj, new.dat = TD, strat.var = "cabinet_id")
TD$pr_maj_kor <- predCL(m_max.maj, new.dat = TD, strat.var = "cabinet_id")
TD$pr_maj_korp <- predCL(m_nmod, new.dat = TD, strat.var = "cabinet_id")

TD.maj$pr_naive <- predCL(m_naive, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_ms01 <- predCL(m_ms7, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_ms10 <- predCL(m_ms10, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_kor <- predCL(m_max, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_korp <- predCL(m_nmod, new.dat = TD.maj, strat.var = "cabinet_id")

TD.maj$pr_maj_naive <- predCL(m_naive.maj, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_maj_ms01 <- predCL(m_ms7.maj, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_maj_ms10 <- predCL(m_ms10.maj, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_maj_kor <- predCL(m_max.maj, new.dat = TD.maj, strat.var = "cabinet_id")
TD.maj$pr_maj_korp <- predCL(m_nmod, new.dat = TD.maj, strat.var = "cabinet_id")


# in-sample predictions...based on models estimated on sample excl. majority situations
DT2$pr_naive[DT2$set == "training"] <- predCL(m_naive, new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")
DT2$pr_ms01[DT2$set == "training"] <- predCL(m_ms7, new.dat = DT2[which(DT2$set == "training"), ],  strat.var = "cabinet_id")
DT2$pr_ms10[DT2$set == "training"] <- predCL(m_ms10, new.dat = DT2[which(DT2$set == "training"), ],   strat.var = "cabinet_id")
DT2$pr_kor[DT2$set == "training"] <- predCL(m_max,new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")
DT2$pr_korp[DT2$set == "training"] <- predCL(m_nmod, new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")

# in-sample predictions...based on models estimated on sample incl. majority situations
DT2$pr_maj_naive[DT2$set == "training"] <- predCL(m_naive.maj, new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")
DT2$pr_maj_ms01[DT2$set == "training"] <- predCL(m_ms7.maj, new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")
DT2$pr_maj_ms10[DT2$set == "training"] <- predCL(m_ms10.maj, new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")
DT2$pr_maj_kor[DT2$set == "training"] <- predCL(m_max.maj, new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")
DT2$pr_maj_korp[DT2$set == "training"] <- predCL(m_nmod, new.dat = DT2[which(DT2$set == "training"), ], strat.var = "cabinet_id")


###############################################################################
#                                  Table B2                                   #
###############################################################################

getPr <- function(dat, pred.var = var){
  
  mps <- unlist(strsplit(pred.var, "_"))[2]
  pr_vals <- dat[[pred.var]]
  
  # maximal predicted probability
  dat[[paste("maxpr", mps, sep = "_")]] <- max(pr_vals)
  
  # Difference between highest and second highest predicted probability
  dat[[paste("pr2d", mps, sep = "_")]] <- -diff(sort(pr_vals, decreasing = TRUE))[1]
  
  return(dat)
  
}
for(i in 1:4){
  var = c("pr_naive", "pr_ms01", "pr_ms10","pr_kor")[i]
  ### Confusion matrix
  
  TDA <- ddply(TD, .(cabinet_id), getPr, pred.var =  var) 
  
  head(TDA)
  TDA[[paste0("pp_",var)]] <- as.numeric(TDA[[var]] == TDA[[paste0("max", var)]])
  
  #TDA.maj[["pp_pr_korp"]] <- as.numeric(TDA.maj[[var]] == TDA.maj[["maxpr_korp"]])
  
  fconf_mat <- matrix(ncol = 2, nrow = 6)
  
  # make point predictions
  tab <- table(TDA[!is.na(TDA$pr_ms01), paste0("pp_",var)], TDA[!is.na(TDA$pr_ms01), "realg"])
  fconf_mat[c(1,3), c(1, 2)] <- tab
  ctab <- prop.table(tab, margin = 2)
  fconf_mat[c(2,4), c(1, 2)] <- ctab
  fconf_mat[5, 1] <- round(tab[2,2]/(tab[2,2]+tab[2,1]),2)
  fconf_mat[6, 1] <- round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
  
  stargazer(fconf_mat
            , digits = 3
            , out = paste0("tableB2_",var,".tex")
            , out.header = FALSE)
  
}


###############################################################################
#                                  Table E1                                   #
###############################################################################
# code to reproduce figures of Table E1 in the Appendix
# this code only provides the infrastructure to reproduce 
# the number and figures presented in Table E1, but
# not in a table layout

require(splitstackshape)

out.e1 <- as.data.frame(matrix(ncol = 6, nrow = 13))
## iterate through the 5 values for "var" given below 
## to calculate the values for each of the 5 models
# choose a column containing one of the predictions based on the six models
for(ii in 1:5){
var <- c("pr_maj_naive","pr_maj_ms01","pr_maj_ms10","pr_maj_kor","pr_maj_korp")[ii]

# subset the data
dat <- TD.maj[order(TD.maj[["cabinet_id"]], -TD.maj[[var]]),]

# identify the predicted coalition
dat$rank <- ave(dat[[var]], dat$cabinet_id, FUN = function(x) rank(-x, ties.method = "first"))

# get the formation opportunity (cabinet_id) and the 
# composition of the true coalition 
cab_index <- unique(dat$cabinet_id)
true_gov <- as.character(dat$cab_comp[dat$realg == 1])
dat$true_coal <- true_gov[match(dat$cabinet_id, cab_index)]

# get information on the (falsly) predicted coalition
false_one <- dat[dat$rank == 1 & dat$realg == 0,c("cabinet_id","cab_comp","cab_seats","realg",
                                                  "numpar","country_name","election_date",
                                                  "rank","true_coal",var)]
# count the number of parties in the true and predicted coalitions
false_one$numpar_coal <- (str_count(false_one$cab_comp, ";") + 1)
false_one$numpar_true <- (str_count(false_one$true_coal, ";") + 1)
false_one$cab_comp <- as.character(false_one$cab_comp)
false_one$true_coal <- as.character(false_one$true_coal)

coal_ptys <- as.list(false_one$cab_comp)
true_ptys <- as.list(false_one$true_coal)

false_one$coal_superset = 0
false_one$coal_subset = 0
false_one$pty_diff = 0

for(i in 1:nrow(false_one)){
  
  tmp <- unlist(str_split(coal_ptys[[i]], "; ")) %in% unlist(str_split(true_ptys[[i]], "; "))
  n_coal  <- length(unlist(str_split(coal_ptys[[i]], "; "))) 
  n_true <- length(unlist(str_split(true_ptys[[i]], ";")))
  n_diff <- abs(n_coal - n_true)  
  count_true <- if(n_true  > n_coal){n_true - as.numeric(as.character(table(tmp)["TRUE"]))}else{n_coal - as.numeric(as.character(table(tmp)["TRUE"]))}
  
  
  if((n_coal > n_true) && (tmp == TRUE)){false_one[i,"coal_superset"] <- 1}
  if((n_true > n_coal) && (tmp == TRUE)){false_one[i, "coal_subset"] <- 1}
  
  false_one[i, "pty_diff"] <- count_true
  
}

no.super <- sum(as.numeric(as.character(false_one[,13])), na.rm = TRUE) # no. of supersets
avg.pty.diff <- round(mean(as.numeric(as.character(false_one[,15])), na.rm = TRUE),3) #avg. party difference
no.sub <- sum(as.numeric(as.character(false_one[,14])), na.rm = TRUE) # no of subsets
no.false <- length(which(is.na(as.character(false_one[,15])))) # no. of completed false coalitions
no.other <- nrow(false_one)-no.super -no.sub-no.false
# ----------------------------------- #
# top five coalitions                 # 
# out-sample with majority situations #
# ----------------------------------- #

  top3 <- getTopThree(dat = TD.maj, group.var = "cabinet_id", preds.var = var, true.coal = "realg")
  f1 = top3$false_first
  f2 = top3$false_second
  f3 = top3$false_third
  f4 = top3$false_fourth
  f5 = top3$false_fifth
  t1 = top3$true_first
  t2 = top3$true_second 
  t3 = top3$true_third
  t4 = top3$true_forth
  t5 = top3$true_fifth
  
  out.e1[1,ii+1] <- f1
  out.e1[2,ii+1] <- no.false
  out.e1[3,ii+1] <- no.super
  out.e1[4,ii+1] <- no.sub
  out.e1[5,ii+1] <- no.other
  out.e1[6,ii+1] <- avg.pty.diff
  out.e1[7,ii+1] <- NA
  out.e1[8,ii+1] <- t1
  out.e1[9,ii+1] <- t2
  out.e1[10,ii+1] <- t3
  out.e1[11,ii+1] <- t4
  out.e1[12,ii+1] <- t5
  out.e1[13,ii+1] <- (t1+t2+t3+t4+t5)
  
}


out.e1[1,1] <- "False Positives"
out.e1[2,1] <- "Completely false composition"
out.e1[3,1] <- "Superset of True Coalition"
out.e1[4,1] <- "Subset of True Coalition"
out.e1[5,1] <- "Other"
out.e1[6,1] <- "Mean difference of parties in true and predicted coalition"
out.e1[7,1] <- "Predicted Rank of True Coalition"
out.e1[8,1] <- "1st Rank (=True Positives)"
out.e1[9,1] <- "2nd Rank"
out.e1[10,1] <- "3rd Rank"
out.e1[11,1] <- "4th Rank"
out.e1[12,1] <- "5th Rank"
out.e1[13,1] <- "True coalitions among top 5 predicted coalitions"


colnames(out.e1) <- c("","Naive Model", "MS 2001", "MS 2010","New","New par.")

print(xtable(out.e1), file = "tableE1.tex")

###############################################################################
#                                 Figure E1                                   #
###############################################################################

require(PRROC)
require(ggplot2)
require(ROCR)

# create empty data.frames to be filled with information
df.in = data.frame()
df.out = data.frame()
df.maj = data.frame()
model.names <- c("Naive Model","MS 2001","MS 2010", "KOR", "KOR par.")
# loop through the different predicted values based on the different models
for(i in 1:5){
  var = c("pr_maj_naive", "pr_maj_ms01", "pr_maj_ms10","pr_maj_kor","pr_maj_korp")[i]
  
  # prepare data for pr.curve function from the PRROC package
  prob1.out.maj <- TD.maj[[var]][TD.maj$realg == 1 & !is.na(TD.maj[[var]])]
  prob0.out.maj <- TD.maj[[var]][TD.maj$realg == 0 & !is.na(TD.maj[[var]])]  
  
  prob1.in.maj <- DT2[[var]][DT2$set == "training" & DT2$realg == 1 ]
  prob0.in.maj <- DT2[[var]][DT2$set == "training" & DT2$realg == 0 ]
  
  pr.out.maj <- pr.curve(scores.class0 = prob1.out.maj, scores.class1 = prob0.out.maj, curve = FALSE)
  pr.in.maj <- pr.curve(scores.class0 = prob1.in.maj, scores.class1 = prob0.in.maj, curve = FALSE)
  
  aupr.out.maj <- pr.out.maj$auc.integral
  aupr.in.maj <- pr.in.maj$auc.integral
  
  # functions from ROCR package
  rocr_preds.out.maj = prediction(TD.maj[[var]], TD.maj$realg)
  rocr_preds.in.maj = prediction(DT2[[var]], DT2$realg)
  
  # get figures on Recall and Precision
  roc_list.out.maj <- performance(rocr_preds.out.maj, "rec", "prec")
  roc_list.in.maj <- performance(rocr_preds.in.maj, "rec", "prec")
  
  tmp1 = cbind(precision = data.frame(roc_list.in.maj@x.values), 
               recall = data.frame(roc_list.in.maj@y.values), 
               aupr = aupr.in.maj,
               model = model.names[i])
  colnames(tmp1) <- c("precision", "recall", "aupr", "model")
  df.in = rbind(df.in, tmp1)
  tmp2 = cbind(precision = data.frame(roc_list.out.maj@x.values), 
               recall = data.frame(roc_list.out.maj@y.values), 
               aupr = aupr.out.maj,
               model =  model.names[i])
  colnames(tmp2) <- c("precision", "recall", "aupr", "model")
  df.maj = rbind(df.maj, tmp2)
  
  
}

# prepare data for gglot function

aupr.maj <- round(unique(df.maj$aupr),2)
aupr.in <- round(unique(df.in$aupr),2)

# create data frame for plotting
ggData = rbind(cbind(df.in, sample = "In-Sample"),
               cbind(df.maj, sample = "Out-Sample"))

colnames(ggData) = c("precision" ,"recall"  ,  "aupr"    ,  "model"  , "sample")
levels(ggData$model)
ggData$model_f = factor(ggData$model, levels=c('Naive Model','MS 2001','MS 2010','KOR', 'KOR par.'))

# get annotated text into right format
aupr.in <- as.character(aupr.in)
aupr.maj <- as.character(aupr.maj)

# add a trailing 0 to AUPR figures of only 3 characters length
for(i in 1:length(aupr.in)){if(nchar(aupr.in[i]) == 3){aupr.in[i] <- paste0(aupr.in[i], "0")}}

aupr.list = c(aupr.in, aupr.maj) 
aupr.text = data.frame(paste0("AUPR = \n", aupr.list))
aupr.text$sample <- c(rep("In-Sample", 5), rep("Out-Sample",5))
aupr.text$model <- rep(c("Naive Model","MS 2001", "MS 2010", "KOR", "KOR par."),2)
colnames(aupr.text)[1] <- "label"
aupr.text$model_f = factor(aupr.text$model, levels=c('Naive Model','MS 2001','MS 2010','KOR', 'KOR par.'))

# --------------- Figure E1 --------------- #
pdf("figureE1.pdf")
ggplot(ggData, aes(recall, precision)) + 
  geom_line(size = 1, alpha = 0.7) +
  labs(x = "Recall", y = "Precision")  +   
  geom_text(aes(.2, .2, label= label, group=NULL ), 
            color="black", data = aupr.text) +
  facet_grid(model_f~sample)  + theme_bw() 
dev.off()
# ----------------------------------------- #


###############################################################################
#                                 Figure E2                                   #
###############################################################################

############################################
#         PARTY LEVEL PERFORMANCE          #
############################################

############################################
#  IN - SAMPLE, INCL. MAJORITY SITUATIONS  #
############################################

cabs.in = unique(DT2$cabinet_id)
count.in = length(cabs.in)

pty_ingov_naive.in = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_ms01.in = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_ms10.in = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_kor.in = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_korp.in = data.frame(matrix(nrow = 0, ncol = 7))

colnames(pty_ingov_naive.in) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_ms01.in) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_ms10.in) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_kor.in) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_korp.in) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")

n = 0
for(ii in cabs.in){
  n = n + 1
  print(count.in - n)
  cty_name <- DT2$country_name[DT2$cabinet_id == ii][1]
  #year <-  as.numeric(format(DT2$start_date[DT2$cabinet_id == ii][1], '%Y'))
  year <- as.character(DT2$start_date[DT2$cabinet_id == ii][1])
  true.gov <- as.character(DT2$cab_comp[DT2$cabinet_id == ii & DT2$realg == 1])
  cabinet_id <- ii
  # for pty.sep include one tab white space after separater
  naive <- getPrIngov(DT2[DT2$cabinet_id == ii,], prob.var = "pr_maj_naive", comp.var = "cab_comp", pty.sep = "; ")
  ms01 <- getPrIngov(DT2[DT2$cabinet_id == ii,], prob.var = "pr_maj_ms01", comp.var = "cab_comp", pty.sep = "; ")
  ms10 <- getPrIngov(DT2[DT2$cabinet_id == ii,], prob.var = "pr_maj_ms10", comp.var = "cab_comp", pty.sep = "; ")
  kor <- getPrIngov(DT2[DT2$cabinet_id == ii,], prob.var = "pr_maj_kor", comp.var = "cab_comp", pty.sep = "; ")
  kor.par <- getPrIngov(DT2[DT2$cabinet_id == ii,], prob.var = "pr_maj_korp", comp.var = "cab_comp", pty.sep = "; ")
   
  naive$country <- cty_name
  naive$year <- year
  naive$true_gov <- true.gov
  naive$cabinet_id <- cabinet_id
  naive$in_gov <- as.numeric(naive$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  ms01$country <- cty_name
  ms01$year <- year
  ms01$true_gov <- true.gov
  ms01$cabinet_id <- cabinet_id
  ms01$in_gov <- as.numeric(ms01$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  ms10$country <- cty_name
  ms10$year <- year
  ms10$true_gov <- true.gov
  ms10$cabinet_id <- cabinet_id
  ms10$in_gov <- as.numeric(ms10$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  kor$country <- cty_name
  kor$year <- year
  kor$true_gov <- true.gov
  kor$cabinet_id <- cabinet_id
  kor$in_gov <- as.numeric(kor$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  kor.par$country <- cty_name
  kor.par$year <- year
  kor.par$true_gov <- true.gov
  kor.par$cabinet_id <- cabinet_id
  kor.par$in_gov <- as.numeric(kor.par$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  pty_ingov_naive.in <- rbind(pty_ingov_naive.in, naive)
  pty_ingov_ms01.in <- rbind(pty_ingov_ms01.in, ms01)
  pty_ingov_ms10.in <- rbind(pty_ingov_ms10.in, ms10)
  pty_ingov_kor.in <- rbind(pty_ingov_kor.in, kor)
  pty_ingov_korp.in <- rbind(pty_ingov_korp.in, kor.par)
}

############################################
# OUT OF SAMPLE, INCL. MAJORITY SITUATIONS #
############################################

cabs.maj = unique(TD.maj$cabinet_id)
count.maj = length(cabs.maj)

pty_ingov_naive.maj = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_ms01.maj = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_ms10.maj = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_kor.maj = data.frame(matrix(nrow = 0, ncol = 7))
pty_ingov_korp.maj = data.frame(matrix(nrow = 0, ncol = 7))

colnames(pty_ingov_naive.maj) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_ms01.maj) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_ms10.maj) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_kor.maj) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")
colnames(pty_ingov_korp.maj) <- c("party_abbr", "pr_ingov", "country", "year", "true_gov" , "cabinet_id",  "in_gov")


n = 0
for(ii in cabs.maj){
  n = n + 1
  print(count.maj - n)
  cty_name <- as.character(TD.maj$country_name[TD.maj$cabinet_id == ii][1])
  #year <-  as.numeric(format(TD.maj$start_date[TD.maj$cabinet_id == ii][1], '%Y'))
  year <- as.character(TD.maj$start_date[TD.maj$cabinet_id == ii][1])
  true.gov <- as.character(TD.maj$cab_comp[TD.maj$cabinet_id == ii & TD.maj$realg == 1])
  cabinet_id <- ii
  # for pty.sep include one tab white space after separater
  naive <- getPrIngov(TD.maj[TD.maj$cabinet_id == ii,], prob.var = "pr_maj_naive", comp.var = "cab_comp", pty.sep = "; ")
  ms01 <- getPrIngov(TD.maj[TD.maj$cabinet_id == ii,], prob.var = "pr_maj_ms01", comp.var = "cab_comp", pty.sep = "; ")
  ms10 <- getPrIngov(TD.maj[TD.maj$cabinet_id == ii,], prob.var = "pr_maj_ms10", comp.var = "cab_comp", pty.sep = "; ")
  kor <- getPrIngov(TD.maj[TD.maj$cabinet_id == ii,], prob.var = "pr_maj_kor", comp.var = "cab_comp", pty.sep = "; ")
  kor.par <- getPrIngov(TD.maj[TD.maj$cabinet_id == ii,], prob.var = "pr_maj_korp", comp.var = "cab_comp", pty.sep = "; ")
  
  naive$country <- cty_name
  naive$year <- year
  naive$true_gov <- true.gov
  naive$cabinet_id <- cabinet_id
  naive$in_gov <- as.numeric(naive$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  ms01$country <- cty_name
  ms01$year <- year
  ms01$true_gov <- true.gov
  ms01$cabinet_id <- cabinet_id
  ms01$in_gov <- as.numeric(ms01$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  ms10$country <- cty_name
  ms10$year <- year
  ms10$true_gov <- true.gov
  ms10$cabinet_id <- cabinet_id
  ms10$in_gov <- as.numeric(ms10$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  kor$country <- cty_name
  kor$year <- year
  kor$true_gov <- true.gov
  kor$cabinet_id <- cabinet_id
  kor$in_gov <- as.numeric(kor$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  kor.par$country <- cty_name
  kor.par$year <- year
  kor.par$true_gov <- true.gov
  kor.par$cabinet_id <- cabinet_id
  kor.par$in_gov <- as.numeric(kor.par$party_abbr %in% unlist(strsplit(true.gov, "; ")))
  
  
  pty_ingov_naive.maj <- rbind(pty_ingov_naive.maj, naive)
  pty_ingov_ms01.maj <- rbind(pty_ingov_ms01.maj, ms01)
  pty_ingov_ms10.maj <- rbind(pty_ingov_ms10.maj, ms10)
  pty_ingov_kor.maj <- rbind(pty_ingov_kor.maj, kor)
  pty_ingov_korp.maj <- rbind(pty_ingov_korp.maj, kor.par)
  
}

# lower part of Table D1 
table(as.numeric(pty_ingov_naive.maj$pr_ingov >.5), pty_ingov_naive.maj$in_gov)
table(as.numeric(pty_ingov_ms01.maj$pr_ingov >.5), pty_ingov_ms01.maj$in_gov)
table(as.numeric(pty_ingov_ms10.maj$pr_ingov >.5), pty_ingov_ms10.maj$in_gov)
table(as.numeric(pty_ingov_kor.maj$pr_ingov >.5), pty_ingov_kor.maj$in_gov)
table(as.numeric(pty_ingov_korp.maj$pr_ingov >.5), pty_ingov_korp.maj$in_gov)

############################################
#              AUCPR Plots                 #
############################################
require(PRROC)
require(ggplot2)
require(ROCR)

prob.in.naive.0 <- pty_ingov_naive.in[["pr_ingov"]][pty_ingov_naive.in$in_gov == 0]
prob.in.naive.1 <- pty_ingov_naive.in[["pr_ingov"]][pty_ingov_naive.in$in_gov == 1]
prob.in.ms01.0 <- pty_ingov_ms01.in[["pr_ingov"]][pty_ingov_ms01.in$in_gov == 0] 
prob.in.ms01.1 <- pty_ingov_ms01.in[["pr_ingov"]][pty_ingov_ms01.in$in_gov == 1]
prob.in.ms10.0 <- pty_ingov_ms10.in[["pr_ingov"]][pty_ingov_ms10.in$in_gov== 0]
prob.in.ms10.1 <- pty_ingov_ms10.in[["pr_ingov"]][pty_ingov_ms10.in$in_gov == 1]
prob.in.kor.0 <- pty_ingov_kor.in[["pr_ingov"]][pty_ingov_kor.in$in_gov == 0]
prob.in.kor.1 <- pty_ingov_kor.in[["pr_ingov"]][pty_ingov_kor.in$in_gov == 1]
prob.in.korp.0 <- pty_ingov_korp.in[["pr_ingov"]][pty_ingov_korp.in$in_gov == 0]
prob.in.korp.1 <- pty_ingov_korp.in[["pr_ingov"]][pty_ingov_korp.in$in_gov == 1]

prob.maj.naive.0 <- pty_ingov_naive.maj[["pr_ingov"]][pty_ingov_naive.maj$in_gov == 0]
prob.maj.naive.1 <- pty_ingov_naive.maj[["pr_ingov"]][pty_ingov_naive.maj$in_gov == 1]
prob.maj.ms01.0 <- pty_ingov_ms01.maj[["pr_ingov"]][pty_ingov_ms01.maj$in_gov == 0] 
prob.maj.ms01.1 <- pty_ingov_ms01.maj[["pr_ingov"]][pty_ingov_ms01.maj$in_gov == 1]
prob.maj.ms10.0 <- pty_ingov_ms10.maj[["pr_ingov"]][pty_ingov_ms10.maj$in_gov == 0]
prob.maj.ms10.1 <- pty_ingov_ms10.maj[["pr_ingov"]][pty_ingov_ms10.maj$in_gov == 1]
prob.maj.kor.0 <- pty_ingov_kor.maj[["pr_ingov"]][pty_ingov_kor.maj$in_gov == 0]
prob.maj.kor.1 <- pty_ingov_kor.maj[["pr_ingov"]][pty_ingov_kor.maj$in_gov == 1]
prob.maj.korp.0 <- pty_ingov_korp.maj[["pr_ingov"]][pty_ingov_korp.maj$in_gov == 0]
prob.maj.korp.1 <- pty_ingov_korp.maj[["pr_ingov"]][pty_ingov_korp.maj$in_gov == 1]

pr.naive.in <- pr.curve(scores.class0 = prob.in.naive.1, scores.class1 = prob.in.naive.0, curve = FALSE)
pr.ms01.in <- pr.curve(scores.class0 = prob.in.ms01.1, scores.class1 = prob.in.ms01.0, curve = FALSE)
pr.ms10.in <- pr.curve(scores.class0 = prob.in.ms10.1, scores.class1 = prob.in.ms10.0, curve = FALSE)
pr.kor.in <- pr.curve(scores.class0 = prob.in.kor.1, scores.class1 = prob.in.kor.0, curve = FALSE)
pr.korp.in <- pr.curve(scores.class0 = prob.in.korp.1, scores.class1 = prob.in.korp.0, curve = FALSE)

aupr.in.naive <- pr.naive.in$auc.integral
aupr.in.ms01 <- pr.ms01.in$auc.integral
aupr.in.ms10 <- pr.ms10.in$auc.integral
aupr.in.kor <- pr.kor.in$auc.integral
aupr.in.korp <- pr.korp.in$auc.integral

pr.naive.maj <- pr.curve(scores.class0 = prob.maj.naive.1, scores.class1 = prob.maj.naive.0, curve = FALSE)
pr.ms01.maj <- pr.curve(scores.class0 = prob.maj.ms01.1, scores.class1 = prob.maj.ms01.0, curve = FALSE)
pr.ms10.maj <- pr.curve(scores.class0 = prob.maj.ms10.1, scores.class1 = prob.maj.ms10.0, curve = FALSE)
pr.kor.maj <- pr.curve(scores.class0 = prob.maj.kor.1, scores.class1 = prob.maj.kor.0, curve = FALSE)
pr.korp.maj <- pr.curve(scores.class0 = prob.maj.korp.1, scores.class1 = prob.maj.korp.0, curve = FALSE)

aupr.maj.naive <- pr.naive.maj$auc.integral
aupr.maj.ms01 <- pr.ms01.maj$auc.integral
aupr.maj.ms10 <- pr.ms10.maj$auc.integral
aupr.maj.kor <- pr.kor.maj$auc.integral
aupr.maj.korp <- pr.korp.maj$auc.integral

rocr.preds.in.naive = prediction(pty_ingov_naive.in[["pr_ingov"]], pty_ingov_naive.in$in_gov)
rocr.preds.in.ms01 = prediction(pty_ingov_ms01.in[["pr_ingov"]], pty_ingov_ms01.in$in_gov)
rocr.preds.in.ms10 = prediction(pty_ingov_ms10.in[["pr_ingov"]], pty_ingov_ms10.in$in_gov)
rocr.preds.in.kor = prediction(pty_ingov_kor.in[["pr_ingov"]], pty_ingov_kor.in$in_gov)
rocr.preds.in.korp = prediction(pty_ingov_korp.in[["pr_ingov"]], pty_ingov_korp.in$in_gov)

rocr.preds.maj.naive = prediction(pty_ingov_naive.maj[["pr_ingov"]], pty_ingov_naive.maj$in_gov)
rocr.preds.maj.ms01 = prediction(pty_ingov_ms01.maj[["pr_ingov"]], pty_ingov_ms01.maj$in_gov)
rocr.preds.maj.ms10 = prediction(pty_ingov_ms10.maj[["pr_ingov"]], pty_ingov_ms10.maj$in_gov)
rocr.preds.maj.kor = prediction(pty_ingov_kor.maj[["pr_ingov"]], pty_ingov_kor.maj$in_gov)
rocr.preds.maj.korp = prediction(pty_ingov_korp.maj[["pr_ingov"]], pty_ingov_korp.maj$in_gov)

rocr.perf.in.naive <- performance(rocr.preds.in.naive, "rec", "prec")
rocr.perf.in.ms01 <- performance(rocr.preds.in.ms01, "rec", "prec")
rocr.perf.in.ms10 <- performance(rocr.preds.in.ms10, "rec", "prec")
rocr.perf.in.kor <- performance(rocr.preds.in.kor, "rec", "prec")
rocr.perf.in.korp <- performance(rocr.preds.in.korp, "rec", "prec")

rocr.perf.maj.naive <- performance(rocr.preds.maj.naive, "rec", "prec")
rocr.perf.maj.ms01 <- performance(rocr.preds.maj.ms01, "rec", "prec")
rocr.perf.maj.ms10 <- performance(rocr.preds.maj.ms10, "rec", "prec")
rocr.perf.maj.kor <- performance(rocr.preds.maj.kor, "rec", "prec")
rocr.perf.maj.korp <- performance(rocr.preds.maj.korp, "rec", "prec")

df.in.naive <- cbind(precision = data.frame(rocr.perf.in.naive@x.values),
                     recall = data.frame(rocr.perf.in.naive@y.values), aupr = aupr.in.naive, model = "Naive Model", sample = "In-Sample")
df.in.ms01 <-  cbind(precision = data.frame(rocr.perf.in.ms01@x.values),
                     recall = data.frame(rocr.perf.in.ms01@y.values), aupr = aupr.in.ms01, model = "MS 2001", sample = "In-Sample")
df.in.ms10 <-  cbind(precision = data.frame(rocr.perf.in.ms10@x.values),
                     recall = data.frame(rocr.perf.in.ms10@y.values), aupr = aupr.in.ms10, model = "MS 2010", sample = "In-Sample")
df.in.kor <-   cbind(precision = data.frame(rocr.perf.in.kor@x.values),
                     recall = data.frame(rocr.perf.in.kor@y.values), aupr = aupr.in.kor, model = "KOR", sample = "In-Sample")
df.in.korp <-  cbind(precision = data.frame(rocr.perf.in.korp@x.values),
                     recall = data.frame(rocr.perf.in.korp@y.values), aupr = aupr.in.korp, model = "KOR par.", sample = "In-Sample")

colnames(df.in.naive) = c("precision", "recall", "aupr", "model","sample")
colnames(df.in.ms01) = c("precision", "recall", "aupr", "model","sample")
colnames(df.in.ms10) = c("precision", "recall", "aupr", "model","sample")
colnames(df.in.kor) = c("precision", "recall", "aupr", "model","sample")
colnames(df.in.korp) = c("precision", "recall", "aupr", "model","sample")
gg.in = rbind(df.in.naive, df.in.ms01, df.in.ms10, df.in.kor, df.in.korp)

df.maj.naive <- cbind(precision = data.frame(rocr.perf.maj.naive@x.values),
                      recall = data.frame(rocr.perf.maj.naive@y.values), aupr = aupr.maj.naive, model = "Naive Model", sample = "Out-Sample")
df.maj.ms01 <-  cbind(precision = data.frame(rocr.perf.maj.ms01@x.values),
                      recall = data.frame(rocr.perf.maj.ms01@y.values), aupr = aupr.maj.ms01, model = "MS 2001", sample = "Out-Sample")
df.maj.ms10 <-  cbind(precision = data.frame(rocr.perf.maj.ms10@x.values),
                      recall = data.frame(rocr.perf.maj.ms10@y.values), aupr = aupr.maj.ms10, model = "MS 2010", sample = "Out-Sample")
df.maj.kor <-  cbind(precision = data.frame(rocr.perf.maj.kor@x.values),
                     recall = data.frame(rocr.perf.maj.kor@y.values), aupr = aupr.maj.kor, model = "KOR", sample = "Out-Sample")
df.maj.korp <-cbind(precision = data.frame(rocr.perf.maj.korp@x.values),
                    recall = data.frame(rocr.perf.maj.korp@y.values), aupr = aupr.maj.korp, model = "KOR par.", sample = "Out-Sample")

colnames(df.maj.naive) = c("precision", "recall", "aupr", "model","sample")
colnames(df.maj.ms01) = c("precision", "recall", "aupr", "model","sample")
colnames(df.maj.ms10) = c("precision", "recall", "aupr", "model","sample")
colnames(df.maj.kor) = c("precision", "recall", "aupr", "model","sample")
colnames(df.maj.korp) = c("precision", "recall", "aupr", "model","sample")

gg.maj = rbind(df.maj.naive, df.maj.ms01, df.maj.ms10, df.maj.kor, df.maj.korp)

ggData = rbind(gg.in,  gg.maj)


ggData$model_f <- factor(ggData$model, levels = c("Naive Model", "MS 2001", "MS 2010","KOR","KOR par."))

aupr.list <- unique(ggData[,c("aupr","model","sample")])[,1]
aupr.list <- round(aupr.list, 2)
aupr.text = data.frame(paste0("AUPR = \n", aupr.list))
colnames(aupr.text)[1] <- "label"
aupr.text$sample <- c(rep("In-Sample", 5), rep("Out-Sample",5))
aupr.text$model_f <- as.factor(c("Naive Model", "MS 2001", "MS 2010", "KOR", "KOR par."))
aupr.text$model_f <- factor(aupr.text$model_f, ordered = T)

# ------- FIGURE E2 -------- #
pdf("figureE2.pdf")
ggplot(ggData, aes(recall, precision)) + geom_line(size = 1, alpha = 0.7) +
  labs(x = "Recall", y = "Precision")  +
  facet_grid(model_f~sample)  + theme_bw() +
  geom_text(aes(.2, .5, label= label, group=NULL ), 
            color="black", data = aupr.text)
dev.off()
# -------------------------- #


###############################################################################
#                                  Figure 2                                   #
###############################################################################

# get TPR and FPR values for each model and sample
rocr.maj.naive = prediction(pty_ingov_naive.maj[["pr_ingov"]], pty_ingov_naive.maj$in_gov)
rocr.maj.ms01 = prediction(pty_ingov_ms01.maj[["pr_ingov"]], pty_ingov_ms01.maj$in_gov)
rocr.maj.ms10 = prediction(pty_ingov_ms10.maj[["pr_ingov"]], pty_ingov_ms10.maj$in_gov)
rocr.maj.kor = prediction(pty_ingov_kor.maj[["pr_ingov"]], pty_ingov_kor.maj$in_gov)
rocr.maj.korp = prediction(pty_ingov_korp.maj[["pr_ingov"]], pty_ingov_korp.maj$in_gov) 

perf.maj.naive = performance(rocr.maj.naive, "tpr", "fpr")
perf.maj.ms01 = performance(rocr.maj.ms01,  "tpr", "fpr")
perf.maj.ms10 = performance(rocr.maj.ms10,  "tpr", "fpr")
perf.maj.kor = performance(rocr.maj.kor,  "tpr", "fpr")
perf.maj.korp = performance(rocr.maj.korp,  "tpr", "fpr")

rocr.in.naive = prediction(pty_ingov_naive.in[["pr_ingov"]], pty_ingov_naive.in$in_gov)
rocr.in.ms01 = prediction(pty_ingov_ms01.in[["pr_ingov"]], pty_ingov_ms01.in$in_gov)
rocr.in.ms10 = prediction(pty_ingov_ms10.in[["pr_ingov"]], pty_ingov_ms10.in$in_gov)
rocr.in.kor = prediction(pty_ingov_kor.in[["pr_ingov"]], pty_ingov_kor.in$in_gov)
rocr.in.korp = prediction(pty_ingov_korp.in[["pr_ingov"]], pty_ingov_korp.in$in_gov) 

perf.in.naive = performance(rocr.in.naive, "tpr", "fpr")
perf.in.ms01 = performance(rocr.in.ms01,  "tpr", "fpr")
perf.in.ms10 = performance(rocr.in.ms10,  "tpr", "fpr")
perf.in.kor = performance(rocr.in.kor,  "tpr", "fpr")
perf.in.korp = performance(rocr.in.korp,  "tpr", "fpr")

require(flux)
# combine information on model performances across models and samples 
maj.auc.korp <- flux::auc(data.frame(perf.maj.korp@x.values)[,1], data.frame(perf.maj.korp@y.values)[,1])
maj.auc.kor <- flux::auc(data.frame(perf.maj.kor@x.values)[,1], data.frame(perf.maj.kor@y.values)[,1])
maj.auc.ms10 <- flux::auc(data.frame(perf.maj.ms10@x.values)[,1], data.frame(perf.maj.ms10@y.values)[,1])
maj.auc.ms01 <- flux::auc(data.frame(perf.maj.ms01@x.values)[,1], data.frame(perf.maj.ms01@y.values)[,1])
maj.auc.naive <- flux::auc(data.frame(perf.maj.naive@x.values)[,1], data.frame(perf.maj.naive@y.values)[,1])

in.auc.korp <- flux::auc(data.frame(perf.in.korp@x.values)[,1], data.frame(perf.in.korp@y.values)[,1])
in.auc.kor <- flux::auc(data.frame(perf.in.kor@x.values)[,1], data.frame(perf.in.kor@y.values)[,1])
in.auc.ms10 <- flux::auc(data.frame(perf.in.ms10@x.values)[,1], data.frame(perf.in.ms10@y.values)[,1])
in.auc.ms01 <- flux::auc(data.frame(perf.in.ms01@x.values)[,1], data.frame(perf.in.ms01@y.values)[,1])
in.auc.naive <- flux::auc(data.frame(perf.in.naive@x.values)[,1], data.frame(perf.in.naive@y.values)[,1])

maj.naive <- cbind(fpr = data.frame(perf.maj.naive@x.values), tpr = data.frame(perf.maj.naive@y.values), model = "Naive Model")
maj.ms01 <- cbind(fpr = data.frame(perf.maj.ms01@x.values), tpr = data.frame(perf.maj.ms01@y.values), model = "MS 2001")
maj.ms10 <- cbind(fpr = data.frame(perf.maj.ms10@x.values), tpr = data.frame(perf.maj.ms10@y.values), model = "MS 2010")
maj.kor <- cbind(fpr = data.frame(perf.maj.kor@x.values), tpr = data.frame(perf.maj.kor@y.values), model = "KOR")
maj.korp <- cbind(fpr = data.frame(perf.maj.korp@x.values), tpr = data.frame(perf.maj.korp@y.values), model = "KOR par.")

in.naive <- cbind(fpr = data.frame(perf.in.naive@x.values), tpr = data.frame(perf.in.naive@y.values), model = "Naive Model")
in.ms01 <- cbind(fpr = data.frame(perf.in.ms01@x.values), tpr = data.frame(perf.in.ms01@y.values), model = "MS 2001")
in.ms10 <- cbind(fpr = data.frame(perf.in.ms10@x.values), tpr = data.frame(perf.in.ms10@y.values), model = "MS 2010")
in.kor <- cbind(fpr = data.frame(perf.in.kor@x.values), tpr = data.frame(perf.in.kor@y.values), model = "KOR")
in.korp <- cbind(fpr = data.frame(perf.in.korp@x.values), tpr = data.frame(perf.in.korp@y.values), model = "KOR par.")

# add column names to data.frame objects for models and samples
colnames(maj.naive) <- c("FPR", "TPR", "model")
colnames(maj.ms01) <- c("FPR", "TPR", "model")
colnames(maj.ms10) <- c("FPR", "TPR", "model")
colnames(maj.kor) <- c("FPR", "TPR", "model")
colnames(maj.korp) <- c("FPR", "TPR", "model")

colnames(in.naive) <- c("FPR", "TPR", "model")
colnames(in.ms01) <- c("FPR", "TPR", "model")
colnames(in.ms10) <- c("FPR", "TPR", "model")
colnames(in.kor) <- c("FPR", "TPR", "model")
colnames(in.korp) <- c("FPR", "TPR", "model")

# combine performance across models and samples
ggdat.maj = rbind(maj.naive, maj.ms01, maj.ms10, maj.kor, maj.korp)
ggdat.in = rbind(in.naive, in.ms01, in.ms10, in.kor, in.korp)

# add the two samples into one data.frame
ggData2 = rbind(cbind(ggdat.in, sample = "In-Sample"),
                cbind(ggdat.maj, sample = "Out-Sample"))

ggData2$model_f <- factor(ggData2$model, levels = c("Naive Model","MS 2001", "MS 2010","KOR","KOR par."))

# create vector for AUC values across models and samples
auc.list = c(in.auc.naive, in.auc.ms01, in.auc.ms10, in.auc.kor, in.auc.korp,
             maj.auc.naive, maj.auc.ms01, maj.auc.ms10, maj.auc.kor, maj.auc.korp)
auc.list = round(auc.list, 2)

# add trailing 0 to even decimals 
for(i in 1:length(auc.list)){if( nchar(auc.list[i]) == 3) {auc.list[i] <- paste0(auc.list [i],"0")}}

auc.list <- data.frame(paste0("AUC = ", auc.list))
colnames(auc.list)[1] <- "label"
auc.list$sample <- factor(c(rep("In-Sample",5),  rep("Out-Sample",5)))
auc.list$sample <- as.factor(auc.list$sample)
auc.list$model <- as.factor(c("Naive Model", "MS 2001", "MS 2010", "KOR", "KOR par.",
                              "Naive Model", "MS 2001", "MS 2010", "KOR", "KOR par."))

auc.list$model_f <- factor(auc.list$model, levels = c("Naive Model","MS 2001","MS 2010","KOR", "KOR par."))

# --------- FIGURE 2 -------- #
pdf("figure2.pdf")
ggplot(ggData2 , aes(x = FPR, y = TPR)) + 
  geom_line(size = 1, alpha = 0.7) +
  labs(x = "FPR", y = "TPR") + 
  facet_grid(model_f~sample) + theme_bw() +
  geom_text(aes(.75, .2, label= label , group=NULL ), 
            color="black", data = auc.list)
dev.off()
# --------------------------- # 

###############################################################################
#                                  Table D1                                  #
###############################################################################

# Coalition Level #
getPr <- function(dat, pred.var = "pr_ms7"){
  
  mps <- unlist(strsplit(pred.var, "_"))[2]
  pr_vals <- dat[[pred.var]]
  
  # maximal predicted probability
  dat[[paste("maxpr", mps, sep = "_")]] <- max(pr_vals)
  
  # Difference between highest and second highest predicted probability
  dat[[paste("pr2d", mps, sep = "_")]] <- -diff(sort(pr_vals, decreasing = TRUE))[1]
  
  return(dat)
  
}

# CONFUSION MATRIX 
for(i in 1:5){
  var = c("pr_naive", "pr_ms01", "pr_ms10","pr_kor","pr_korp")[i]
  ### Confusion matrix
  
  TD <- ddply(TD, .(cabinet_id), getPr, pred.var =  var) 
  
  head(TD)
  TD[[paste0("pp_",var)]] <- as.numeric(TD[[var]] == TD[[paste0("max",var)]])
  
  fconf_mat <- matrix(ncol = 2, nrow = 4)
  
  # make point predictions
  tab <- table(TD[!is.na(TD$pr_ms01), paste0("pp_",var)], TD[!is.na(TD$pr_ms01), "realg"])
  fconf_mat[c(1,3), c(1, 2)] <- tab
  ctab <- prop.table(tab, margin = 2)
  fconf_mat[c(2,4), c(1, 2)] <- ctab
  
  stargazer(fconf_mat
            , digits = 3
            #, out = "./report/tab_fcmat.tex"
            , out.header = FALSE)
  
}


TD[[paste0("pp_",var)]] <- as.numeric(TD[[var]] == TD[[paste0("max",var)]])

## data for upper part of Table D1
# point estimates, incl. majority situations #
for(i in 1:5){
  var = c("pr_maj_naive", "pr_maj_ms01", "pr_maj_ms10","pr_maj_kor","pr_maj_korp")[i]
  ### Confusion matrix
  
  TDA.maj <- ddply(TD.maj, .(cabinet_id), getPr, pred.var =  var) 
  
  head(TDA.maj)
  TDA.maj[[paste0("pp_",var)]] <- as.numeric(TDA.maj[[var]] == TDA.maj[[paste0("maxpr_maj")]])
  
  #TDA.maj[["pp_pr_korp"]] <- as.numeric(TDA.maj[[var]] == TDA.maj[["maxpr_korp"]])
  
  fconf_mat <- matrix(ncol = 2, nrow = 14)
  
  # make point predictions
  tab <- table(TDA.maj[!is.na(TDA.maj$pr_ms01), paste0("pp_",var)], TDA.maj[!is.na(TDA.maj$pr_ms01), "realg"])
  fconf_mat[c(1,3), c(1, 2)] <- tab
  ctab <- prop.table(tab, margin = 2)
  fconf_mat[c(2,4), c(1, 2)] <- ctab
  fconf_mat[5, 1] <- round(tab[2,2]/(tab[2,2]+tab[2,1]),2)
  fconf_mat[6, 1] <- round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
  fconf_mat[7, 1] <- as.numeric(as.character(aupr.maj[[i]]))
  
  if(var == "pr_maj_naive"){
  tab <- table(as.numeric(pty_ingov_naive.maj$pr_ingov >.5), pty_ingov_naive.maj$in_gov)
  fconf_mat[c(8,10), c(1, 2)] <- tab
  fconf_mat[c(9,11), c(1, 2)] <- prop.table(tab,margin = 2)
  fconf_mat[12, 1] <- round(tab[2,2]/(tab[2,2]+tab[2,1]),2)
  fconf_mat[13, 1] <- round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
  fconf_mat[14, 1] <- as.numeric(gsub("AUC = ","",as.character(auc.list[auc.list$sample=="Out-Sample",1]))[i])
  }
  if(var == "pr_maj_ms01"){
    tab <- table(as.numeric(pty_ingov_ms01.maj$pr_ingov >.5), pty_ingov_ms01.maj$in_gov)
    fconf_mat[c(8,10), c(1, 2)] <- tab
    fconf_mat[c(9,11), c(1, 2)] <- prop.table(tab,margin = 2)
    fconf_mat[12, 1] <- round(tab[2,2]/(tab[2,2]+tab[2,1]),2)
    fconf_mat[13, 1] <- round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
    fconf_mat[14, 1] <- as.numeric(gsub("AUC = ","",as.character(auc.list[auc.list$sample=="Out-Sample",1]))[i])
  }
  if(var == "pr_maj_ms10"){
    tab <- table(as.numeric(pty_ingov_ms10.maj$pr_ingov >.5), pty_ingov_ms10.maj$in_gov)
    fconf_mat[c(8,10), c(1, 2)] <- tab
    fconf_mat[c(9,11), c(1, 2)] <- prop.table(tab,margin = 2)
    fconf_mat[12, 1] <- round(tab[2,2]/(tab[2,2]+tab[2,1]),2)
    fconf_mat[13, 1] <- round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
    fconf_mat[14, 1] <- as.numeric(gsub("AUC = ","",as.character(auc.list[auc.list$sample=="Out-Sample",1]))[i])
  }
  if(var == "pr_maj_kor"){
    tab <- table(as.numeric(pty_ingov_kor.maj$pr_ingov >.5), pty_ingov_kor.maj$in_gov)
    fconf_mat[c(8,10), c(1, 2)] <- tab
    fconf_mat[c(9,11), c(1, 2)] <- prop.table(tab,margin = 2)
    fconf_mat[12, 1] <- round(tab[2,2]/(tab[2,2]+tab[2,1]),2)
    fconf_mat[13, 1] <- round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
    fconf_mat[14, 1] <- as.numeric(gsub("AUC = ","",as.character(auc.list[auc.list$sample=="Out-Sample",1]))[i])
  }
  if(var == "pr_maj_korp"){
    tab <- table(as.numeric(pty_ingov_korp.maj$pr_ingov >.5), pty_ingov_korp.maj$in_gov)
    fconf_mat[c(8,10), c(1, 2)] <- tab
    fconf_mat[c(9,11), c(1, 2)] <- prop.table(tab,margin = 2)
    fconf_mat[12, 1] <- round(tab[2,2]/(tab[2,2]+tab[2,1]),2)
    fconf_mat[13, 1] <- round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
    fconf_mat[14, 1] <- as.numeric(gsub("AUC = ","",as.character(auc.list[auc.list$sample=="Out-Sample",1]))[i])
  }
  
  
  # Table D1
  stargazer(fconf_mat
            , digits = 3
            , out = paste0("tableD1_",var,".tex")
            , out.header = FALSE)
  
}
